{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from scipy.sparse import hstack\n",
    "from sklearn.cross_validation import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from unidecode import unidecode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def textcleaning(string):\n",
    "    string = re.sub('http\\S+|www.\\S+', '',' '.join([i for i in string.split() if i.find('#')<0 and i.find('@')<0]))\n",
    "    string = unidecode(string).replace('.', '. ').replace(',', ', ')\n",
    "    string = re.sub('[^\\'\\\"A-Za-z\\- ]+', ' ', string)\n",
    "    return ' '.join([i for i in re.findall(\"[\\\\w']+|[;:\\-\\(\\)&.,!?\\\"]\", string) if len(i)>1]).lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(40911, 7)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('toxic-bm.csv')\n",
    "df = df.dropna()\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(df.shape[0]):\n",
    "    df.iloc[i,0] = textcleaning(df.iloc[i,0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "word_vectorizer = TfidfVectorizer(\n",
    "    sublinear_tf=True,\n",
    "    strip_accents='unicode',\n",
    "    analyzer='word',\n",
    "    token_pattern=r'\\w{1,}',\n",
    "    stop_words='english',\n",
    "    ngram_range=(1, 1),\n",
    "    max_features=10000)\n",
    "word_vectorizer.fit(df.iloc[:,0])\n",
    "word_features = word_vectorizer.transform(df.iloc[:,0])\n",
    "\n",
    "char_vectorizer = TfidfVectorizer(\n",
    "    sublinear_tf=True,\n",
    "    strip_accents='unicode',\n",
    "    analyzer='char',\n",
    "    stop_words='english',\n",
    "    ngram_range=(2, 6),\n",
    "    max_features=50000)\n",
    "char_vectorizer.fit(df.iloc[:,0])\n",
    "char_features = char_vectorizer.transform(df.iloc[:,0])\n",
    "\n",
    "features = hstack([char_features, word_features])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(features, df, test_size = 0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "toxic\n",
      "severe_toxic\n",
      "obscene\n",
      "threat\n",
      "insult\n",
      "identity_hate\n"
     ]
    }
   ],
   "source": [
    "class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n",
    "\n",
    "models = []\n",
    "predicted = pd.DataFrame.from_dict({'id': np.arange(len(test_Y))})\n",
    "for class_name in class_names:\n",
    "    print(class_name)\n",
    "    train_target = train_Y[class_name]\n",
    "    classifier = LogisticRegression(C=0.1, solver='sag')\n",
    "    classifier.fit(train_X, train_target)\n",
    "    predicted[class_name] = classifier.predict_proba(test_X)[:, 1]\n",
    "    models.append(classifier)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "with open('logistics.pkl','wb') as fopen:\n",
    "    pickle.dump(models, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.98      0.27      0.43       805\n",
      "          1       0.50      0.02      0.04        88\n",
      "          2       0.99      0.30      0.46       460\n",
      "          3       0.00      0.00      0.00        32\n",
      "          4       0.87      0.22      0.35       420\n",
      "          5       0.00      0.00      0.00        68\n",
      "\n",
      "avg / total       0.88      0.24      0.38      1873\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "print(metrics.classification_report(test_Y.iloc[:,1:],np.around(predicted.iloc[:,1:].values)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "toxic\n",
      "severe_toxic\n",
      "obscene\n",
      "threat\n",
      "insult\n",
      "identity_hate\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.81      0.52      0.63       805\n",
      "          1       0.44      0.35      0.39        88\n",
      "          2       0.76      0.49      0.59       460\n",
      "          3       0.00      0.00      0.00        32\n",
      "          4       0.68      0.47      0.56       420\n",
      "          5       0.15      0.09      0.11        68\n",
      "\n",
      "avg / total       0.71      0.47      0.56      1873\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.naive_bayes import MultinomialNB\n",
    "class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n",
    "\n",
    "models = []\n",
    "predicted = pd.DataFrame.from_dict({'id': np.arange(len(test_Y))})\n",
    "for class_name in class_names:\n",
    "    print(class_name)\n",
    "    train_target = train_Y[class_name]\n",
    "    classifier = MultinomialNB()\n",
    "    classifier.fit(train_X, train_target)\n",
    "    predicted[class_name] = classifier.predict_proba(test_X)[:, 1]\n",
    "    models.append(classifier)\n",
    "    \n",
    "print(metrics.classification_report(test_Y.iloc[:,1:],np.around(predicted.iloc[:,1:].values)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('multinomials.pkl','wb') as fopen:\n",
    "    pickle.dump(models, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('vectorizer.pkl','wb') as fopen:\n",
    "    pickle.dump({'word':word_vectorizer,'char':char_vectorizer}, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
